## data analysis '53 paper 
## Random Forest
rm(list = ls())

library(randomForest)

setwd("~/Dropbox/Current projects/1953/Replication archive")


## choose signal strength measure to employ
signal <- "lrm_4"

## should municipalities be dropped/recoded to account for possible jamming?
## jamming = km10, km12, km15; adjust = "drop", "min"
jamming <- ""
adjust <- ""

load(paste("1953data", signal, adjust, jamming, ".RData", sep = ""))



##########################################################
## Random Forest with cluster-bootstrapped standard errors
##########################################################
#  get model matrix
probit.out <- glm(protest ~

							county_capital +
							KVP +
							LandForstwirtschaft +
							Bergbau +
							Metallverarbeitung +
							Chemie +
							HolzKunstmassen +
							Verbrauchsgueter +
							Bauwirtschaft +
							Verkehrswesen +
							Handel +
							Dienstleistung +
							log(population) +
							prop_male +
							density +
							pop_change +
							prop_Land_same39 +
							prop_educ_8 +
							prop_educ_10 +
							prop_educ_12 +
							prop_educ_12plus +
							prop_educ_Fachschule +
							prop_educ_Hochschule +
							prop_evangelisch +
							prop_katholisch +
							prop_glaubenslos +
							living_space +
							turnout32 +
							invalid32 +
							NSDAP32 +
							SPD32 +
							KPD32 +
							Z32 +
							DNVP32 +
							DVP32 +
							signal +
							distance_to_berlin +
							as.factor(size) ,
							x = TRUE, y = TRUE, model = TRUE,
							data = dat, family = binomial(link = "probit"))

X <- model.matrix(probit.out)[,-1]
y <- as.factor(dat$protest)



set.seed(123)
rf.out <- randomForest(y = y, x = X, keep.forest = TRUE, ntree = 1000)
rf.out

d.dist <- seq(from = min(dat$distance_to_berlin), to = max(dat$distance_to_berlin),
					length.out = 20)
d.signal <- seq(from = min(dat$signal), to = max(dat$signal), length.out = 20)

out.dist <- rep(NA, length(d.dist))
out.signal <- rep(NA, length(d.signal))
names(out.dist) <- d.dist
names(out.signal) <- d.signal


m <- 1
for(m in 1:length(d.dist))	{

	X.test <- X
	X.test[,"distance_to_berlin"] <- d.dist[m]
	out.dist[m] <- mean(predict(rf.out, newdata = X.test, type = "prob")[,2])

	X.test <- X
	X.test[,"signal"] <- d.signal[m]
	out.signal[m] <- mean(predict(rf.out, newdata = X.test, type = "prob")[,2])

							}


plot(d.dist, out.dist, ylim = c(0,.5), type = "l",
	xlab = "Distance to East Berlin (km)",
	ylab = "Estimated probability of protest event", cex.lab = 1.2,
	cex.axis = 1.2, lwd = 2)


plot(d.signal, out.signal, ylim = c(0,.5), type = "l",
	xlab = "RIAS signal strength (dBuV/m)",
	ylab = "Estimated probability of protest event", cex.lab = 1.2,
	cex.axis = 1.2, lwd = 2)

## if we choose a large d we end up with identical columns because there are never
## any tree splits for a given range of signal strength; messes up Wald test



## cluster-bootstrapped SEs
clusters <- as.character(unique(dat$county))

set.seed(1)
nboots <- 1000
rf.boot.total.dist <- matrix(NA, nboots, length(d.dist))
colnames(rf.boot.total.dist) <- d.dist

rf.boot.total.signal <- matrix(NA, nboots, length(d.signal))
colnames(rf.boot.total.signal) <- d.signal



##  this code could be parallelized to speed things up...

w <- 1
for(w in 1:nboots)	{

	inds <- NULL
	sel <- sample(clusters, length(clusters), replace = TRUE)
	sel <- table(sel)	## how often is each county included in the bootstrap sample?
	times <- as.numeric(names(table(sel)))

	## now we loop over number of times counties are included in the bootstrap sample
	k <- 1
	for(k in times)	{

		## which clusters are included k times?
		sel.k <- names(sel)[sel == k]
		## add municipalities within these counties to the sample k times
		inds <- c(inds, rep(which(is.element(dat$county, sel.k) == TRUE), k))

					}

	dat.boot <- dat[inds,]


	probit.boot <- glm(protest ~

							county_capital +
							KVP +
							LandForstwirtschaft +
							Bergbau +
							Metallverarbeitung +
							Chemie +
							HolzKunstmassen +
							Verbrauchsgueter +
							Bauwirtschaft +
							Verkehrswesen +
							Handel +
							Dienstleistung +
							log(population) +
							prop_male +
							density +
							pop_change +
							prop_Land_same39 +
							prop_educ_8 +
							prop_educ_10 +
							prop_educ_12 +
							prop_educ_12plus +
							prop_educ_Fachschule +
							prop_educ_Hochschule +
							prop_evangelisch +
							prop_katholisch +
							prop_glaubenslos +
							living_space +
							turnout32 +
							invalid32 +
							NSDAP32 +
							SPD32 +
							KPD32 +
							Z32 +
							DNVP32 +
							DVP32 +
							signal +
							distance_to_berlin +
							as.factor(size) +
							as.factor(district),
							x = TRUE, y = TRUE, model = TRUE,
							data = dat.boot, family = binomial(link = "probit"))

		## we might get error messages about separation but we don't care;
		## we only need the model matrix

		X.boot <- model.matrix(probit.boot)[,-1]
		y.boot <- as.factor(dat.boot$protest)

		rf.out.boot <- randomForest(y = y.boot, x = X.boot, keep.forest = TRUE,
						ntree = 1000)


		m <- 1
		for(m in 1:length(d.dist))	{

			X.test <- X.boot
			X.test[,"distance_to_berlin"] <- d.dist[m]
			rf.boot.total.dist[w,m] <- mean(predict(rf.out.boot, newdata = X.test,
										type = "prob")[,2])

			X.test <- X.boot
			X.test[,"signal"] <- d.signal[m]
			rf.boot.total.signal[w,m] <- mean(predict(rf.out.boot, newdata = X.test,
										type = "prob")[,2])

									}

		rm(probit.boot)
		cat(w,"\n")

					}

#save.image("1953RandomForest.RData")



out2.dist <- matrix(NA,2,length(d.dist))
out2.dist[1,] <- out.dist + apply(rf.boot.total.dist,2,sd)*qnorm(.975)
out2.dist[2,] <- out.dist - apply(rf.boot.total.dist,2,sd)*qnorm(.975)

out2.signal <- matrix(NA,2,length(d.signal))
out2.signal[1,] <- out.signal + apply(rf.boot.total.signal,2,sd)*qnorm(.975)
out2.signal[2,] <- out.signal - apply(rf.boot.total.signal,2,sd)*qnorm(.975)


plot(d.dist, out.dist, ylim = c(0,.5), type = "l",
	xlab = "Distance to East Berlin (km)",
	ylab = "Estimated probability of protest event", cex.lab = 1.2,
	cex.axis = 1.2, lwd = 2)

polygon(x = c(d.dist, rev(d.dist)), y = c(out2.dist[1,], rev(out2.dist[2,])),
col = "#66666666", border = NA)
lines(d.dist, out.dist, lwd = 2)



plot(d.signal, out.signal, ylim = c(0,.5), type = "l",
	xlab = "RIAS signal strength (dBuV/m)",
	ylab = "Estimated probability of protest event", cex.lab = 1.2,
	cex.axis = 1.2, lwd = 2)

polygon(x = c(d.signal, rev(d.signal)), y = c(out2.signal[1,], rev(out2.signal[2,])),
col = "#66666666", border = NA)
lines(d.signal, out.signal, lwd = 2)





















.
